Borough Relationship to Crime

setwd("/Volumes/FactoryUsers/Users/bdaniel/Dropbox/Columbia Video Network/2018 a Data Visualization/CrimeDataNYC/CrimeData")

library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
crime_df <- fread("NYPD_Complaint_Data_Historic.csv",na.strings="")
## Read 5580035 rows and 24 (of 24) columns from 1.329 GB file in 00:00:14
crime_df <- crime_df %>%
  mutate(CMPLNT_FR_DT = as.Date(CMPLNT_FR_DT, format='%m/%d/%Y'),
         CMPLNT_TO_DT = as.Date(CMPLNT_TO_DT, format='%m/%d/%Y'),
         RPT_DT = as.Date(RPT_DT, format='%m/%d/%Y'))
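
A quick sanity check that the date parsing worked (a minimal sketch; NA here means the field was empty in the CSV or failed to parse):

# count start dates that are NA after conversion
sum(is.na(crime_df$CMPLNT_FR_DT))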

# give the columns shorter, readable names
var_names <- c("Id", "DateStart", "TimeStart", "DateEnd", "TimeEnd", "DateReport", "ClassCode", "OffenseDesc", 
               "IntClassCode", "IntOffenseDesc", "AtptCptdStatus", "Level", "Jurisdiction", "Boro", "Pct", "LocOfOccr", "PremDesc", 
               "ParkName", "HousingDevName", "XCoord", "YCoord", "Lat", "Long", "Lat_Long")

colnames(crime_df)<-var_names

crime_df <- crime_df %>% mutate_if(is.character, as.factor)

setwd("/Volumes/FactoryUsers/Users/bdaniel/Dropbox/Columbia Video Network/2018 a Data Visualization/CrimeDataNYC")

#bring in weather data first
weather_select = c("DATE", "AWND", "PRCP", "SNOW", "TMAX")
weather_data <- fread("Data_Files/nyc_weather_data.csv", na.strings="", select = weather_select, stringsAsFactors = FALSE)

# bring in Borough population and massage it: keep the citywide total plus the
# five boroughs, and label them to match the Boro values in crime_df
bdf <- fread("Data_Files/BoroughPop.csv")
bdf <- bdf[1:6,]
bdf$Boro <- c("TOTAL","BRONX","BROOKLYN","MANHATTAN","QUEENS","STATEN ISLAND")
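
Before merging later on, it’s worth checking that the borough labels in the two tables actually line up (a quick sketch):

# boroughs present in crime_df but missing from bdf -- empty means aligned
setdiff(levels(crime_df$Boro), bdf$Boro)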


# summarize for mosaic, per capita plots
df_bsum <-crime_df %>% 
  filter(!is.na(Boro)) %>%
  group_by(Boro,Level) %>% 
  summarize(Freq = n())

# merge in the borough population
df_bsum <- merge(df_bsum, bdf, by="Boro")

# per capita calculation
df_bsum$PerCap <-df_bsum$Freq/df_bsum$`2016 Estimate`

library(grid)
library(vcd)

That was prep for the first plot.

# Mosaic By Count
mosaic(Level~Boro,df_bsum, direction=c("v","h"), main="Crime by Borough by Level")
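
For reference, the contingency table this mosaic is drawing can be tabulated directly with xtabs(), which follows the same Freq-column convention (a sketch):

# counts of complaints by borough and level, weighted by the Freq column
xtabs(Freq ~ Boro + Level, data = df_bsum)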

This shows crime totals by borough and level: Staten Island, for example, has far fewer crimes than Brooklyn.

# By Per Capita -- mosaic() takes its frequencies from the column named "Freq",
# so for a per capita plot you need to swap the column names
colnames(df_bsum)[colnames(df_bsum)=="Freq"] <- "Count"
colnames(df_bsum)[colnames(df_bsum)=="PerCap"] <- "Freq"
mosaic(Level~Boro,df_bsum, direction=c("v","h"), main="Crime per Capita by Borough by Level")

Here, we see that Brooklyn’s crime rate per capita is not the highest (width of bars); it appears the Bronx leads, then Manhattan. We can also see that Queens has the lowest per capita rate.

The mix of crime by level, per capita, doesn’t vary that much across boroughs. I have more to look into here.

Rich suggested we check to see if, by limiting to just the year(s) of the population figures, we see much difference. I wouldn’t say we do.

# bdf already carries a matching "Boro" column (added above), so the merge
# keys line up without any renaming

# limit to specific years of the population data and test
# start with 2010
# summarize for mosaic, per capita plots
df_bsum2010 <-crime_df %>% 
  filter(!is.na(Boro)) %>%
  filter(DateStart > "2009-12-31" & DateStart < "2011-01-01") %>%
  group_by(Boro,Level) %>% 
  summarize(Freq = n())

# merge in the borough population
df_bsum2010 <- merge(df_bsum2010, bdf, by="Boro")


# per capita calculation 
df_bsum2010$PerCap <-df_bsum2010$Freq/df_bsum2010$`2010 Population`

#2010 mosaic
colnames(df_bsum2010)[colnames(df_bsum2010)=="Freq"] <- "Count"
colnames(df_bsum2010)[colnames(df_bsum2010)=="PerCap"] <- "Freq"
mosaic(Level~Boro,df_bsum2010, direction=c("v","h"), main="2010 Crime per Capita by Borough by Level")

# now 2016 Estimate
# summarize for mosaic, per capita plots
df_bsum2016 <-crime_df %>% 
  filter(!is.na(Boro)) %>%
  filter(DateStart > "2015-12-31" & DateStart < "2017-01-01") %>%
  group_by(Boro,Level) %>% 
  summarize(Freq = n())

# merge in the borough population
df_bsum2016 <- merge(df_bsum2016, bdf, by="Boro")

# per capita calculation
df_bsum2016$PerCap <-df_bsum2016$Freq/df_bsum2016$`2016 Estimate`


#2016
colnames(df_bsum2016)[colnames(df_bsum2016)=="Freq"] <- "Count"
colnames(df_bsum2016)[colnames(df_bsum2016)=="PerCap"] <- "Freq"
mosaic(Level~Boro,df_bsum2016, direction=c("v","h"), main="2016 Crime per Capita by Borough by Level")

Then, let’s compare those mosaic panels a little more directly to see how the per capita crime rate changed over time.

#Plot 2010 year over 2016 year by Borough
colnames(df_bsum2010)[colnames(df_bsum2010)=="Freq"] <- "PerCap10"
colnames(df_bsum2016)[colnames(df_bsum2016)=="Freq"] <- "PerCap16"
df_bsum.pcap <- merge(df_bsum2010,df_bsum2016, by=c("Boro","Level"))
df_bsum.pcap$Count.y <- NULL
df_bsum.pcap$Borough.y <- NULL
df_bsum.pcap$"2010 Population.y" <- NULL
df_bsum.pcap$"2016 Estimate.y" <- NULL

tidy_bsum <- tidyr::gather(df_bsum.pcap, key="Year", value="PerCap", -"Boro", -"Level", -"Count.x", -"2010 Population.x", -"2016 Estimate.x", -"Borough.x")
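
If a newer tidyr (>= 1.0) is available, the same reshape can be written with pivot_longer(); a sketch equivalent to the gather() call above:

# gather()'s successor: melt PerCap10/PerCap16 into Year/PerCap pairs
tidy_bsum <- tidyr::pivot_longer(df_bsum.pcap,
                                 cols = c(PerCap10, PerCap16),
                                 names_to = "Year",
                                 values_to = "PerCap")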

library(ggplot2)
ggplot(tidy_bsum, aes(x=Year, y=PerCap, fill=Level)) +
  geom_bar(stat="identity", position="dodge") +
  scale_fill_discrete(name="Level",
                      labels=c("Felony", "Misdemeanor", "Violation")) +
  scale_x_discrete(labels=c(PerCap10="2010", PerCap16="2016")) +
  xlab("Year") + ylab("Per Capita Crime") +
  facet_wrap(~Boro) +
  ggtitle("Per Capita Crime Rates by Level by Borough over Time")

These graphs show an apparent drop in the crime rate between 2010 and 2016, mostly driven by Misdemeanors (in every borough, but most prominently in the Bronx). The Felony rate is mostly unchanged, except in Manhattan. Violations have gone up in every borough except Staten Island (perhaps lending support to my theory that Violations are a function of available police hours to write tickets).
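
To put numbers behind that read of the bars, a quick sketch computing the 2010-to-2016 percent change from the merged per capita table:

# percent change in the per capita rate, 2010 -> 2016, by borough and level
df_bsum.pcap %>%
  mutate(PctChange = 100 * (PerCap16 - PerCap10) / PerCap10) %>%
  select(Boro, Level, PerCap10, PerCap16, PctChange) %>%
  arrange(PctChange)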

Crime by Level vs. Temperature

Setting up the data as Rich did:

#Convert the weather columns to their proper types -- per Rich
weather_data$DATE <- as.Date(weather_data$DATE)
weather_data$AWND <- as.numeric(weather_data$AWND)
weather_data$PRCP <- as.numeric(weather_data$PRCP)
weather_data$SNOW <- as.numeric(weather_data$SNOW)
weather_data$TMAX <- as.numeric(weather_data$TMAX)

#copy the date under the key name used in crime_df for the merge below
weather_data$DateStart <- weather_data$DATE

#Merge the data together -- per Rich
crime_df <- merge(crime_df,weather_data,by="DateStart")
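
Worth noting: merge() defaults to an inner join, so complaint rows whose DateStart has no matching weather date are silently dropped. A quick check (sketch):

# rows surviving the join, plus a spot check on the attached weather values
nrow(crime_df)
sum(is.na(crime_df$TMAX))  # NAs here mean a date matched but TMAX was missing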

Relationship between Crime and Temperature

### Relationship between max temp and crime volume
# set up the data by day and Level
daily_df <-crime_df %>% group_by(DateStart,Level) %>% summarize(CrimeCount=n(),MaxTemp=mean(TMAX))
# plot it -- well, plot it later after the linear models are run so we can see the linear slopes
library(ggplot2)
# daily_df %>% ggplot(aes(x=MaxTemp, y=CrimeCount, color=Level)) + geom_point()
# linear model: Felonies
f_df <- daily_df %>% filter(Level=="FELONY")
flm <- lm(CrimeCount~MaxTemp, f_df)
# linear model: Misdemeanors
m_df <- daily_df %>% filter(Level=="MISDEMEANOR")
mlm <- lm(CrimeCount~MaxTemp, m_df)
# linear model: Violation
v_df <- daily_df %>% filter(Level=="VIOLATION")
vlm <- lm(CrimeCount~MaxTemp, v_df)
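
The three filter-then-fit steps can also be collapsed into one pass; a sketch using split() and lapply(), equivalent to flm/mlm/vlm above:

# one lm per crime level, kept in a named list ("FELONY", "MISDEMEANOR", "VIOLATION")
models <- lapply(split(daily_df, daily_df$Level),
                 function(d) lm(CrimeCount ~ MaxTemp, data = d))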

Let’s check the normality of the daily counts.

# see shape of the daily counts... normal?
daily_df %>% ggplot(aes(CrimeCount)) +
  geom_histogram() +
  facet_wrap(~Level) +
  ggtitle("Histograms of Daily Crime Count by Level of Crime")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(daily_df, aes(x=CrimeCount)) +
  geom_density(aes(group=Level, color=Level, fill=Level), alpha=0.3) +
  ggtitle("Density Curves of Daily Crime Count by Level of Crime")

qqnorm(f_df$CrimeCount, main="Normal Q-Q Plot, Daily Felony Count", col="#F8766D") 

qqnorm(m_df$CrimeCount, main="Normal Q-Q Plot, Daily Misdemeanor Count", col="#00BA38") 

qqnorm(v_df$CrimeCount, main="Normal Q-Q Plot, Daily Violation Count", col="#619CFF") 
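
The Q-Q plots are easier to read with a reference line; base R’s qqline() adds one, e.g. for the felony counts (a sketch):

# redraw with a reference line through the first and third quartiles
qqnorm(f_df$CrimeCount, main="Normal Q-Q Plot, Daily Felony Count", col="#F8766D")
qqline(f_df$CrimeCount)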

Scatterplot with linear model lines

#replot the scatterplot, now with the fitted lines and their equations
ggplot(daily_df, aes(x=MaxTemp, y=CrimeCount, color=Level)) + 
  geom_point(alpha=0.5) +
  geom_abline(slope=coef(flm)[["MaxTemp"]], intercept=coef(flm)[["(Intercept)"]]) +
  annotate("text", x=25, y=450, label=paste0("y=", round(coef(flm)[["MaxTemp"]],2), "x+", round(coef(flm)[["(Intercept)"]],0))) +
  geom_abline(slope=coef(mlm)[["MaxTemp"]], intercept=coef(mlm)[["(Intercept)"]]) +
  annotate("text", x=25, y=770, label=paste0("y=", round(coef(mlm)[["MaxTemp"]],2), "x+", round(coef(mlm)[["(Intercept)"]],0))) +
  geom_abline(slope=coef(vlm)[["MaxTemp"]], intercept=coef(vlm)[["(Intercept)"]]) +
  annotate("text", x=25, y=200, label=paste0("y=", round(coef(vlm)[["MaxTemp"]],2), "x+", round(coef(vlm)[["(Intercept)"]],0))) +
  ggtitle("Daily Crime Counts vs. Temperature by Level of Crime with Linear Models")

Summaries of linear models

# print summaries
summary(flm)
## 
## Call:
## lm(formula = CrimeCount ~ MaxTemp, data = f_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -218.51  -36.57   -4.58   31.43  470.74 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 348.99926    3.31188  105.38   <2e-16 ***
## MaxTemp       1.19673    0.05035   23.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.97 on 4016 degrees of freedom
## Multiple R-squared:  0.1233, Adjusted R-squared:  0.1231 
## F-statistic: 564.9 on 1 and 4016 DF,  p-value: < 2.2e-16
summary(mlm)
## 
## Call:
## lm(formula = CrimeCount ~ MaxTemp, data = m_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -496.09  -66.82   -2.56   65.97  735.75 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 646.74038    5.70118  113.44   <2e-16 ***
## MaxTemp       2.26264    0.08668   26.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.79 on 4016 degrees of freedom
## Multiple R-squared:  0.1451, Adjusted R-squared:  0.1449 
## F-statistic: 681.5 on 1 and 4016 DF,  p-value: < 2.2e-16
summary(vlm)
## 
## Call:
## lm(formula = CrimeCount ~ MaxTemp, data = v_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -97.557 -15.495  -0.562  15.047  95.097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 123.74613    1.30622   94.74   <2e-16 ***
## MaxTemp       0.72275    0.01986   36.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.86 on 4016 degrees of freedom
## Multiple R-squared:  0.248,  Adjusted R-squared:  0.2478 
## F-statistic:  1325 on 1 and 4016 DF,  p-value: < 2.2e-16

The p-values are all very low, owing to how much data there is. The adjusted R-squared values are more modest: temperature explains some of the variance, most for Violations (about 0.25, versus roughly 0.12 for Felonies and 0.14 for Misdemeanors), given how tight the distribution of violation counts is. Perhaps that’s a function of the police’s capacity to write violations?
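
For the record, the adjusted R-squared values side by side (a sketch pulling them from the fitted models):

# adjusted R-squared per model
sapply(list(Felony=flm, Misdemeanor=mlm, Violation=vlm),
       function(m) summary(m)$adj.r.squared)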

Now, we check the residuals of the linear models.

# Residuals plots -- draw each scatter, then add a zero reference line with
# abline(h=0) (calling abline() inside plot() works only by accident of lazy
# argument evaluation)
plot(f_df$MaxTemp, resid(flm), 
     main="Residual Plot, Felonies vs. Max Temp", 
     xlab="Maximum Temperature", 
     ylab="Residuals of Crime Count", 
     col="#F8766D")
abline(h=0)

plot(m_df$MaxTemp, resid(mlm), 
     main="Residual Plot, Misdemeanors vs. Max Temp", 
     xlab="Maximum Temperature", 
     ylab="Residuals of Crime Count", 
     col="#00BA38")
abline(h=0)

plot(v_df$MaxTemp, resid(vlm), 
     main="Residual Plot, Violations vs. Max Temp", 
     xlab="Maximum Temperature", 
     ylab="Residuals of Crime Count", 
     col="#619CFF")
abline(h=0)

Crime Monthly Trend Check

# start by extending Anita's Time Series analyses
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
# compare to Anita's Monthly graph
crime_time <-crime_df %>% filter(year(DateStart)>2005) %>%
  group_by(Date=floor_date(DateStart, "month"),Level) %>% summarize(count=n())
ggplot(crime_time, aes(Date,count, color=Level))+ geom_line() +
  ggtitle("Trend/Rate of Crimes in Each Category Across year - sampled month-wise")

#Check trend of months -- build monthabb as an ordered factor so the facets
#sort Jan..Dec (assigning levels() positionally would relabel, and scramble,
#the months)
crime_time$month <- month(crime_time$Date)
crime_time$monthabb <- factor(month.abb[crime_time$month], levels = month.abb)

ggplot(crime_time, aes(Date,count, color=Level))+ geom_line() + facet_wrap(~monthabb) +
  ggtitle("Trend of Crimes in Each Category Across Years, Faceted by Month")

This is sort of interesting: each facet tracks a single month across the years, so seasonal differences between months and trends within a month can be compared directly.